import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
#contient les informations concernant l'inégalité des revenus dans le monde (119 pays) pour l'année 2019,
# il contient des informations concernant la part que les 1%, 10% et 50% de la population gagnent par rapport au revenu total de la population.
wid = pd.read_excel('WID_Data.xlsx')
wid
| Country | top_10_percent | bottom_50_percent | top_1_percent | |
|---|---|---|---|---|
| 0 | Afghanistan | 0.4204 | NaN | NaN |
| 1 | Afghanistan | NaN | 0.1873 | NaN |
| 2 | Afghanistan | NaN | NaN | 0.1478 |
| 3 | Albania | 0.3395 | NaN | NaN |
| 4 | Albania | NaN | 0.1931 | NaN |
| ... | ... | ... | ... | ... |
| 352 | Zambia | NaN | 0.0731 | NaN |
| 353 | Zambia | NaN | NaN | 0.2308 |
| 354 | Zimbabwe | 0.5198 | NaN | NaN |
| 355 | Zimbabwe | NaN | 0.1225 | NaN |
| 356 | Zimbabwe | NaN | NaN | 0.1729 |
357 rows × 4 columns
#Indice de GII : estimé selon trois dimensions : la santé reproductive des femmes, leur autonomisation et le marché du travail
# Compris entre 0 et 1, plus sa valeur est élevée, plus le pays est inégalitaire entre hommes et femmes
gii = pd.read_excel('GII.xlsx')
gii
| Country | Gender Inequality Index | |
|---|---|---|
| 0 | Norway | 0.045 |
| 1 | Ireland | 0.093 |
| 2 | Switzerland | 0.025 |
| 3 | Iceland | 0.058 |
| 4 | Germany | 0.084 |
| ... | ... | ... |
| 157 | Mali | 0.671 |
| 158 | Burundi | 0.504 |
| 159 | Chad | 0.710 |
| 160 | Central African Republic | 0.680 |
| 161 | Niger | 0.642 |
162 rows × 2 columns
#Il semble que pour chaque pays il y ait 3 lignes (Une pour chaque pourcentage)
# On veut avoir une ligne par pays avec l'alignement des pourcentages à la suite.
wid.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 357 entries, 0 to 356 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 357 non-null object 1 top_10_percent 119 non-null float64 2 bottom_50_percent 119 non-null float64 3 top_1_percent 119 non-null float64 dtypes: float64(3), object(1) memory usage: 11.3+ KB
wid.describe()
| top_10_percent | bottom_50_percent | top_1_percent | |
|---|---|---|---|
| count | 119.000000 | 119.000000 | 119.000000 |
| mean | 0.446123 | 0.162753 | 0.158431 |
| std | 0.090202 | 0.045189 | 0.050093 |
| min | 0.283800 | 0.062700 | 0.069700 |
| 25% | 0.366000 | 0.126500 | 0.115800 |
| 50% | 0.449400 | 0.166800 | 0.151500 |
| 75% | 0.497400 | 0.193350 | 0.199000 |
| max | 0.650800 | 0.262400 | 0.309100 |
#On crée un Dataframe Avec seulement les 10%
top_10_percent=wid.copy()
top_10_percent.drop(['bottom_50_percent', 'top_1_percent'], axis = 1, inplace = True)
top_10_percent= top_10_percent.dropna(subset=['top_10_percent'])
top_10_percent
| Country | top_10_percent | |
|---|---|---|
| 0 | Afghanistan | 0.4204 |
| 3 | Albania | 0.3395 |
| 6 | Algeria | 0.3729 |
| 9 | Angola | 0.5771 |
| 12 | Argentina | 0.3980 |
| ... | ... | ... |
| 342 | Venezuela | 0.4851 |
| 345 | Viet Nam | 0.4260 |
| 348 | Yemen | 0.4820 |
| 351 | Zambia | 0.6149 |
| 354 | Zimbabwe | 0.5198 |
119 rows × 2 columns
# #On crée un Dataframe Avec seulement les 50%
bottom_50_percent=wid.copy()
bottom_50_percent.drop(['top_10_percent', 'top_1_percent'], axis = 1, inplace = True)
bottom_50_percent= bottom_50_percent.dropna(subset=['bottom_50_percent'])
bottom_50_percent
| Country | bottom_50_percent | |
|---|---|---|
| 1 | Afghanistan | 0.1873 |
| 4 | Albania | 0.1931 |
| 7 | Algeria | 0.2071 |
| 10 | Angola | 0.0951 |
| 13 | Argentina | 0.1782 |
| ... | ... | ... |
| 343 | Venezuela | 0.1265 |
| 346 | Viet Nam | 0.1809 |
| 349 | Yemen | 0.1521 |
| 352 | Zambia | 0.0731 |
| 355 | Zimbabwe | 0.1225 |
119 rows × 2 columns
# #On crée un Dataframe Avec seulement les 1%
top_1_percent=wid.copy()
top_1_percent.drop(['top_10_percent', 'bottom_50_percent'], axis = 1, inplace = True)
top_1_percent= top_1_percent.dropna(subset=['top_1_percent'])
top_1_percent
| Country | top_1_percent | |
|---|---|---|
| 2 | Afghanistan | 0.1478 |
| 5 | Albania | 0.0896 |
| 8 | Algeria | 0.0970 |
| 11 | Angola | 0.2584 |
| 14 | Argentina | 0.1474 |
| ... | ... | ... |
| 344 | Venezuela | 0.2021 |
| 347 | Viet Nam | 0.1515 |
| 350 | Yemen | 0.1487 |
| 353 | Zambia | 0.2308 |
| 356 | Zimbabwe | 0.1729 |
119 rows × 2 columns
#On a bien 119 Rows dans chacun des 3 dataframe
#It's Time to merge. On fait une jointure des 3 dataframes
# Jointure des bottom_50 avec top_10_percent
Step1=pd.merge(bottom_50_percent,top_10_percent, on = ['Country'])
Step1
| Country | bottom_50_percent | top_10_percent | |
|---|---|---|---|
| 0 | Afghanistan | 0.1873 | 0.4204 |
| 1 | Albania | 0.1931 | 0.3395 |
| 2 | Algeria | 0.2071 | 0.3729 |
| 3 | Angola | 0.0951 | 0.5771 |
| 4 | Argentina | 0.1782 | 0.3980 |
| ... | ... | ... | ... |
| 114 | Venezuela | 0.1265 | 0.4851 |
| 115 | Viet Nam | 0.1809 | 0.4260 |
| 116 | Yemen | 0.1521 | 0.4820 |
| 117 | Zambia | 0.0731 | 0.6149 |
| 118 | Zimbabwe | 0.1225 | 0.5198 |
119 rows × 3 columns
# 2ême jointure de step1 avec le top_1_percent
wid1=pd.merge(Step1,top_1_percent, on = ['Country'])
# On supprime les espaces entre les chaines de caractères
wid1.replace(" ", "")
# rajout de la colonne country et conversion en chaine de caractère
wid1['Country']=gii['Country'].astype(str)
wid1.describe()
#Au maximum les 50% les moins riches possèdent 26% des richesses du pays
| bottom_50_percent | top_10_percent | top_1_percent | |
|---|---|---|---|
| count | 119.000000 | 119.000000 | 119.000000 |
| mean | 0.162753 | 0.446123 | 0.158431 |
| std | 0.045189 | 0.090202 | 0.050093 |
| min | 0.062700 | 0.283800 | 0.069700 |
| 25% | 0.126500 | 0.366000 | 0.115800 |
| 50% | 0.166800 | 0.449400 | 0.151500 |
| 75% | 0.193350 | 0.497400 | 0.199000 |
| max | 0.262400 | 0.650800 | 0.309100 |
wid1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 119 entries, 0 to 118 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 119 non-null object 1 bottom_50_percent 119 non-null float64 2 top_10_percent 119 non-null float64 3 top_1_percent 119 non-null float64 dtypes: float64(3), object(1) memory usage: 4.6+ KB
#vérifions maintenant l'accurancy des données
#Ici il faudrait créer une colonne test qui somme les 3 valeurs. SI cette colonne = 1 alors = OK sinon KO
gii.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 162 entries, 0 to 161 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 162 non-null object 1 Gender Inequality Index 162 non-null float64 dtypes: float64(1), object(1) memory usage: 2.7+ KB
# Trier le dataframe gii par ordre croissant, suppression d'espace dans les chaines de caractères et conversion du la colonne country en chaine de caractère
gii=gii.sort_values(by = 'Country', ascending = True)
gii.replace(" ", "")
gii['Country']=gii['Country'].astype(str)
# Jointure du dataframe wid1 avec le datafame gii avec la clé commune Country. Tri par ordre croissant de 'Gender Inequality Index'
merged=pd.merge(wid1,gii,on='Country', how = 'left')
merged=merged.sort_values(by = 'Gender Inequality Index', ascending = True)
merged.head(16)
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index | |
|---|---|---|---|---|---|
| 2 | Switzerland | 0.2071 | 0.3729 | 0.0970 | 0.025 |
| 8 | Denmark | 0.1265 | 0.4851 | 0.2021 | 0.038 |
| 5 | Sweden | 0.2025 | 0.3449 | 0.1147 | 0.039 |
| 12 | Belgium | 0.2146 | 0.3173 | 0.0849 | 0.043 |
| 7 | Netherlands | 0.2448 | 0.3196 | 0.0923 | 0.043 |
| 0 | Norway | 0.1873 | 0.4204 | 0.1478 | 0.045 |
| 9 | Finland | 0.1675 | 0.5012 | 0.1737 | 0.047 |
| 23 | France | 0.1006 | 0.6020 | 0.2784 | 0.049 |
| 3 | Iceland | 0.0951 | 0.5771 | 0.2584 | 0.058 |
| 19 | Slovenia | 0.1130 | 0.5166 | 0.1573 | 0.063 |
| 20 | Korea (Republic of) | 0.1836 | 0.3936 | 0.1430 | 0.064 |
| 10 | Singapore | 0.1971 | 0.4171 | 0.1577 | 0.065 |
| 21 | Luxembourg | 0.0803 | 0.6463 | 0.3086 | 0.065 |
| 27 | Italy | 0.0946 | 0.5154 | 0.2079 | 0.069 |
| 16 | Austria | 0.1029 | 0.5733 | 0.2765 | 0.069 |
| 22 | Spain | 0.1295 | 0.4893 | 0.1559 | 0.070 |
merged.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 119 entries, 2 to 109 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 119 non-null object 1 bottom_50_percent 119 non-null float64 2 top_10_percent 119 non-null float64 3 top_1_percent 119 non-null float64 4 Gender Inequality Index 119 non-null float64 dtypes: float64(4), object(1) memory usage: 5.6+ KB
#Vérifions si nous avons des N/A
merged[merged['Gender Inequality Index'].isnull()]
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index |
|---|
merged[merged['bottom_50_percent'].isnull()]
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index |
|---|
merged[merged['top_10_percent'].isnull()]
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index |
|---|
merged[merged['top_1_percent'].isnull()]
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index |
|---|
#Notre dataframe est complet !
Après un tri asc sur le Gii, nous créons 3 catégories TOP15 = 15 pays les mieux notés MEDIUM = le reste de pays FLOP15 = 15 pays les moins bien notés
merged['GII_Ranking'] = pd.Series()
merged
merged['INEQUALITY_Ranking'] = pd.Series()
merged
<ipython-input-24-d5fb58f2ef86>:1: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. merged['GII_Ranking'] = pd.Series() <ipython-input-24-d5fb58f2ef86>:3: DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. merged['INEQUALITY_Ranking'] = pd.Series()
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index | GII_Ranking | INEQUALITY_Ranking | |
|---|---|---|---|---|---|---|---|
| 2 | Switzerland | 0.2071 | 0.3729 | 0.0970 | 0.025 | NaN | NaN |
| 8 | Denmark | 0.1265 | 0.4851 | 0.2021 | 0.038 | NaN | NaN |
| 5 | Sweden | 0.2025 | 0.3449 | 0.1147 | 0.039 | NaN | NaN |
| 12 | Belgium | 0.2146 | 0.3173 | 0.0849 | 0.043 | NaN | NaN |
| 7 | Netherlands | 0.2448 | 0.3196 | 0.0923 | 0.043 | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 113 | Guatemala | 0.1692 | 0.4494 | 0.1599 | 0.479 | NaN | NaN |
| 97 | Indonesia | 0.2186 | 0.3424 | 0.1220 | 0.480 | NaN | NaN |
| 117 | India | 0.0731 | 0.6149 | 0.2308 | 0.488 | NaN | NaN |
| 105 | Gabon | 0.1320 | 0.5312 | 0.2024 | 0.525 | NaN | NaN |
| 109 | Iraq | 0.2025 | 0.3449 | 0.1147 | 0.577 | NaN | NaN |
119 rows × 7 columns
# Alimentation de la colonne 'GII_Ranking avec les conditions emises sur les valeurs du 'Gender Inequality'
conditionlist = [(merged['Gender Inequality Index'] <=0.069) ,
(merged['Gender Inequality Index'] >= 0.070) & (merged['Gender Inequality Index'] <0.440),
(merged['Gender Inequality Index'] >= 0.440)]
choicelist = ['TOP15', 'MID', 'FLOP15']
merged['GII_Ranking'] = np.select(conditionlist, choicelist, default='Not Specified')
merged.head(16)
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index | GII_Ranking | INEQUALITY_Ranking | |
|---|---|---|---|---|---|---|---|
| 2 | Switzerland | 0.2071 | 0.3729 | 0.0970 | 0.025 | TOP15 | NaN |
| 8 | Denmark | 0.1265 | 0.4851 | 0.2021 | 0.038 | TOP15 | NaN |
| 5 | Sweden | 0.2025 | 0.3449 | 0.1147 | 0.039 | TOP15 | NaN |
| 12 | Belgium | 0.2146 | 0.3173 | 0.0849 | 0.043 | TOP15 | NaN |
| 7 | Netherlands | 0.2448 | 0.3196 | 0.0923 | 0.043 | TOP15 | NaN |
| 0 | Norway | 0.1873 | 0.4204 | 0.1478 | 0.045 | TOP15 | NaN |
| 9 | Finland | 0.1675 | 0.5012 | 0.1737 | 0.047 | TOP15 | NaN |
| 23 | France | 0.1006 | 0.6020 | 0.2784 | 0.049 | TOP15 | NaN |
| 3 | Iceland | 0.0951 | 0.5771 | 0.2584 | 0.058 | TOP15 | NaN |
| 19 | Slovenia | 0.1130 | 0.5166 | 0.1573 | 0.063 | TOP15 | NaN |
| 20 | Korea (Republic of) | 0.1836 | 0.3936 | 0.1430 | 0.064 | TOP15 | NaN |
| 10 | Singapore | 0.1971 | 0.4171 | 0.1577 | 0.065 | TOP15 | NaN |
| 21 | Luxembourg | 0.0803 | 0.6463 | 0.3086 | 0.065 | TOP15 | NaN |
| 27 | Italy | 0.0946 | 0.5154 | 0.2079 | 0.069 | TOP15 | NaN |
| 16 | Austria | 0.1029 | 0.5733 | 0.2765 | 0.069 | TOP15 | NaN |
| 22 | Spain | 0.1295 | 0.4893 | 0.1559 | 0.070 | MID | NaN |
conditionlist = [(merged['top_1_percent'] <=) , (merged['Gender Inequality Index'] >= 0.070) & (merged['Gender Inequality Index'] <0.440), (merged['Gender Inequality Index'] >= 0.440)] choicelist = ['TOP15', 'MID', 'FLOP15'] merged[INEQUALITY_Ranking'] = np.select(conditionlist, choicelist, default='Not Specified') merged.head(16)
merged['Ratio']=merged['Gender Inequality Index']*merged['top_10_percent']*100
merged.head()
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index | GII_Ranking | INEQUALITY_Ranking | Ratio | |
|---|---|---|---|---|---|---|---|---|
| 2 | Switzerland | 0.2071 | 0.3729 | 0.0970 | 0.025 | TOP15 | NaN | 0.93225 |
| 8 | Denmark | 0.1265 | 0.4851 | 0.2021 | 0.038 | TOP15 | NaN | 1.84338 |
| 5 | Sweden | 0.2025 | 0.3449 | 0.1147 | 0.039 | TOP15 | NaN | 1.34511 |
| 12 | Belgium | 0.2146 | 0.3173 | 0.0849 | 0.043 | TOP15 | NaN | 1.36439 |
| 7 | Netherlands | 0.2448 | 0.3196 | 0.0923 | 0.043 | TOP15 | NaN | 1.37428 |
merged['Gender Inequality Index'].plot(kind='box')
<AxesSubplot:>
px.scatter(data_frame=merged, x="bottom_50_percent", y="top_10_percent", color="GII_Ranking",hover_name='Country', size="top_1_percent")
px.bar(data_frame=merged, x="bottom_50_percent", y='Country',color="GII_Ranking")
fig = px.bar(merged, x="Country", y="bottom_50_percent", color="GII_Ranking", title="Long-Form Input")
fig.show()
correlation_matrix = merged.corr().round(2)
plt.figure(figsize=(16, 10), dpi=100)
plt.title("Matrice de corrélation")
sns.heatmap(correlation_matrix, cmap="RdBu_r", center=0.0, annot=True)
plt.show()
continent= pd.read_csv('countryContinent.csv',encoding = 'Ansi')
continent.head()
| country | code_2 | code_3 | country_code | iso_3166_2 | continent | sub_region | region_code | sub_region_code | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AF | AFG | 4 | ISO 3166-2:AF | Asia | Southern Asia | 142.0 | 34.0 |
| 1 | Åland Islands | AX | ALA | 248 | ISO 3166-2:AX | Europe | Northern Europe | 150.0 | 154.0 |
| 2 | Albania | AL | ALB | 8 | ISO 3166-2:AL | Europe | Southern Europe | 150.0 | 39.0 |
| 3 | Algeria | DZ | DZA | 12 | ISO 3166-2:DZ | Africa | Northern Africa | 2.0 | 15.0 |
| 4 | American Samoa | AS | ASM | 16 | ISO 3166-2:AS | Oceania | Polynesia | 9.0 | 61.0 |
# Importation des tableaux de la page html avec les pays et les localisation en latitude et longitude
df= pd.read_html('https://developers.google.com/public-data/docs/canonical/countries_csv')
df
[ country latitude longitude name 0 AD 42.546245 1.601554 Andorra 1 AE 23.424076 53.847818 United Arab Emirates 2 AF 33.939110 67.709953 Afghanistan 3 AG 17.060816 -61.796428 Antigua and Barbuda 4 AI 18.220554 -63.068615 Anguilla .. ... ... ... ... 240 YE 15.552727 48.516388 Yemen 241 YT -12.827500 45.166244 Mayotte 242 ZA -30.559482 22.937506 South Africa 243 ZM -13.133897 27.849332 Zambia 244 ZW -19.015438 29.154857 Zimbabwe [245 rows x 4 columns]]
# transformation de df en un dataframe df2 que l'on pourra manipuler dans pandas
df2 = df[0]
df2
| country | latitude | longitude | name | |
|---|---|---|---|---|
| 0 | AD | 42.546245 | 1.601554 | Andorra |
| 1 | AE | 23.424076 | 53.847818 | United Arab Emirates |
| 2 | AF | 33.939110 | 67.709953 | Afghanistan |
| 3 | AG | 17.060816 | -61.796428 | Antigua and Barbuda |
| 4 | AI | 18.220554 | -63.068615 | Anguilla |
| ... | ... | ... | ... | ... |
| 240 | YE | 15.552727 | 48.516388 | Yemen |
| 241 | YT | -12.827500 | 45.166244 | Mayotte |
| 242 | ZA | -30.559482 | 22.937506 | South Africa |
| 243 | ZM | -13.133897 | 27.849332 | Zambia |
| 244 | ZW | -19.015438 | 29.154857 | Zimbabwe |
245 rows × 4 columns
countryLocalisation2 = pd.read_csv('countryLocalisation.csv')
countryLocalisation2.head()
| Unnamed: 0 | country | latitude | longitude | name | |
|---|---|---|---|---|---|
| 0 | 0 | AD | 42.546245 | 1.601554 | Andorra |
| 1 | 1 | AE | 23.424076 | 53.847818 | United Arab Emirates |
| 2 | 2 | AF | 33.939110 | 67.709953 | Afghanistan |
| 3 | 3 | AG | 17.060816 | -61.796428 | Antigua and Barbuda |
| 4 | 4 | AI | 18.220554 | -63.068615 | Anguilla |
#### renomme la colonne 'country' de countryLocalisation2 en 'code_2' pour pouvoir faire la jointure avec le dataframe 'continent'
countryLocalisation2 = countryLocalisation2.rename(columns = {'country' : 'code_2'})
countryLocalisation2.head()
| Unnamed: 0 | code_2 | latitude | longitude | name | |
|---|---|---|---|---|---|
| 0 | 0 | AD | 42.546245 | 1.601554 | Andorra |
| 1 | 1 | AE | 23.424076 | 53.847818 | United Arab Emirates |
| 2 | 2 | AF | 33.939110 | 67.709953 | Afghanistan |
| 3 | 3 | AG | 17.060816 | -61.796428 | Antigua and Barbuda |
| 4 | 4 | AI | 18.220554 | -63.068615 | Anguilla |
# Jointure des dataframe countryLocalisation2 et continent. clé commune 'code_2'
Localisation = pd.merge(continent, countryLocalisation2, on='code_2', how = 'left')
Localisation.head()
| country | code_2 | code_3 | country_code | iso_3166_2 | continent | sub_region | region_code | sub_region_code | Unnamed: 0 | latitude | longitude | name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AF | AFG | 4 | ISO 3166-2:AF | Asia | Southern Asia | 142.0 | 34.0 | 2.0 | 33.939110 | 67.709953 | Afghanistan |
| 1 | Åland Islands | AX | ALA | 248 | ISO 3166-2:AX | Europe | Northern Europe | 150.0 | 154.0 | NaN | NaN | NaN | NaN |
| 2 | Albania | AL | ALB | 8 | ISO 3166-2:AL | Europe | Southern Europe | 150.0 | 39.0 | 5.0 | 41.153332 | 20.168331 | Albania |
| 3 | Algeria | DZ | DZA | 12 | ISO 3166-2:DZ | Africa | Northern Africa | 2.0 | 15.0 | 58.0 | 28.033886 | 1.659626 | Algeria |
| 4 | American Samoa | AS | ASM | 16 | ISO 3166-2:AS | Oceania | Polynesia | 9.0 | 61.0 | 11.0 | -14.270972 | -170.132217 | American Samoa |
# Garder que les colonnes intérressantes pour la visualisation Country, code_2, Continent,latitude et longitude, sub_region
#df = df[['year', 'name']]
Localisation2 = Localisation[['country', 'continent', 'latitude', 'longitude', 'sub_region', 'code_2']]
Localisation2.head()
| country | continent | latitude | longitude | sub_region | code_2 | |
|---|---|---|---|---|---|---|
| 0 | Afghanistan | Asia | 33.939110 | 67.709953 | Southern Asia | AF |
| 1 | Åland Islands | Europe | NaN | NaN | Northern Europe | AX |
| 2 | Albania | Europe | 41.153332 | 20.168331 | Southern Europe | AL |
| 3 | Algeria | Africa | 28.033886 | 1.659626 | Northern Africa | DZ |
| 4 | American Samoa | Oceania | -14.270972 | -170.132217 | Polynesia | AS |
# suppression des espaces entre les carcatères
Localisation2.replace(" ", "")
merged['Country'] = merged['Country'].str.replace(r"\(.*\)","")
# Jointure du dataframe merged et Localisation2
Localisation2 = Localisation2.rename(columns = {'country' : 'Country'})
countryFinal = pd.merge(merged,Localisation2,on='Country', how = 'left')
countryFinal =countryFinal.drop(columns = 'INEQUALITY_Ranking')
countryFinal
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index | GII_Ranking | Ratio | continent | latitude | longitude | sub_region | code_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Switzerland | 0.2071 | 0.3729 | 0.0970 | 0.025 | TOP15 | 0.93225 | Europe | 46.818188 | 8.227512 | Western Europe | CH |
| 1 | Denmark | 0.1265 | 0.4851 | 0.2021 | 0.038 | TOP15 | 1.84338 | Europe | 56.263920 | 9.501785 | Northern Europe | DK |
| 2 | Sweden | 0.2025 | 0.3449 | 0.1147 | 0.039 | TOP15 | 1.34511 | Europe | 60.128161 | 18.643501 | Northern Europe | SE |
| 3 | Belgium | 0.2146 | 0.3173 | 0.0849 | 0.043 | TOP15 | 1.36439 | Europe | 50.503887 | 4.469936 | Western Europe | BE |
| 4 | Netherlands | 0.2448 | 0.3196 | 0.0923 | 0.043 | TOP15 | 1.37428 | Europe | 52.132633 | 5.291266 | Western Europe | NL |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 114 | Guatemala | 0.1692 | 0.4494 | 0.1599 | 0.479 | FLOP15 | 21.52626 | Americas | 15.783471 | -90.230759 | Central America | GT |
| 115 | Indonesia | 0.2186 | 0.3424 | 0.1220 | 0.480 | FLOP15 | 16.43520 | Asia | -0.789275 | 113.921327 | South-Eastern Asia | ID |
| 116 | India | 0.0731 | 0.6149 | 0.2308 | 0.488 | FLOP15 | 30.00712 | Asia | 20.593684 | 78.962880 | Southern Asia | IN |
| 117 | Gabon | 0.1320 | 0.5312 | 0.2024 | 0.525 | FLOP15 | 27.88800 | Africa | -0.803689 | 11.609444 | Middle Africa | GA |
| 118 | Iraq | 0.2025 | 0.3449 | 0.1147 | 0.577 | FLOP15 | 19.90073 | Asia | 33.223191 | 43.679291 | Western Asia | IQ |
119 rows × 12 columns
# ignorer les valeurs manquantes dans la colonne 'Continent'
countryFinal[countryFinal['continent'].isnull()]
| Country | bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index | GII_Ranking | Ratio | continent | latitude | longitude | sub_region | code_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10 | Korea | 0.1836 | 0.3936 | 0.1430 | 0.064 | TOP15 | 2.51904 | NaN | NaN | NaN | NaN | NaN |
| 31 | United Kingdom | 0.2025 | 0.3449 | 0.1147 | 0.118 | MID | 4.06982 | NaN | NaN | NaN | NaN | NaN |
| 35 | Czechia | 0.1484 | 0.4142 | 0.1392 | 0.136 | MID | 5.63312 | NaN | NaN | NaN | NaN | NaN |
| 36 | North Macedonia | 0.1901 | 0.3943 | 0.1456 | 0.143 | MID | 5.63849 | NaN | NaN | NaN | NaN | NaN |
| 45 | Moldova | 0.2565 | 0.3090 | 0.1044 | 0.204 | MID | 6.30360 | NaN | NaN | NaN | NaN | NaN |
| 46 | United States | 0.0870 | 0.5889 | 0.2259 | 0.204 | MID | 12.01356 | NaN | NaN | NaN | NaN | NaN |
| 96 | Bolivia | 0.0627 | 0.6508 | 0.1921 | 0.417 | MID | 27.13836 | NaN | NaN | NaN | NaN | NaN |
| 110 | Iran | 0.1917 | 0.4492 | 0.1414 | 0.459 | FLOP15 | 20.61828 | NaN | NaN | NaN | NaN | NaN |
| 113 | Venezuela | 0.2564 | 0.2884 | 0.0896 | 0.479 | FLOP15 | 13.81436 | NaN | NaN | NaN | NaN | NaN |
Graph1=countryFinal.groupby(['continent']).mean()
Graph1
| bottom_50_percent | top_10_percent | top_1_percent | Gender Inequality Index | Ratio | latitude | longitude | |
|---|---|---|---|---|---|---|---|
| continent | |||||||
| Africa | 0.167891 | 0.445382 | 0.162664 | 0.405455 | 18.393174 | 5.988414 | 14.854229 |
| Americas | 0.164754 | 0.446831 | 0.158165 | 0.366115 | 16.295047 | 5.850731 | -74.639993 |
| Asia | 0.167842 | 0.442864 | 0.156867 | 0.293212 | 13.156397 | 26.498127 | 71.883451 |
| Europe | 0.154520 | 0.450971 | 0.160714 | 0.112486 | 4.985976 | 49.480633 | 16.525249 |
| Oceania | 0.142260 | 0.468000 | 0.170840 | 0.260800 | 12.880676 | -23.538233 | 28.154530 |
fig = px.bar(countryFinal, x="continent", y="bottom_50_percent", color="GII_Ranking", title="TEST", hover_name='Country')
fig.show()
Rappel : Le dataset wid contenait les informations concernant l'inégalité des revenus dans le monde (119 pays) pour l'année 2019, Contenait des informations concernant la part que les 1%, 10% et 50% de la population gagnent par rapport au revenu total de la population. Le dataset GII : estimé selon trois dimensions : la santé reproductive des femmes, leur autonomisation et le marché du travail Compris entre 0 et 1, plus sa valeur est élevée, plus le pays est inégalitaire entre hommes et femmes.
Nous avons ajouté une mesure contenant : TOP15 = 15 pays les mieux notés MEDIUM = le reste de pays FLOP15 = 15 pays les moins bien notés
# Visualisation cartographique.
fig = px.scatter_mapbox(countryFinal, lat='latitude', lon='longitude', hover_name="Country", hover_data=['bottom_50_percent','top_10_percent'],
color="GII_Ranking",size="Ratio", zoom=0, height=300)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()